1. a specific suggestion for a plot: for each of your sampled beta vectors, normalize beta_t by its “largest” element (where largest means largest in absolute value, but you normalize by the actual value, not the absolute value, so that the normalized vector has maximum element +1). Then just plot a histogram of the resulting normalized beta values (pooled across all the samples). We should see very strong negative values are rare?

Here I use posterior means:

par(mfrow=c(1,2))
hist(as.vector(unlist(eznorm)),main="E(Z|Data)/E(Z|Data)[max(abs(E(Z|Data))]",freq=F,xlab="E(Z|Data)/E(Z|Data)[max(abs(E(Z|Data))]",ylim=c(0,2))
hist(as.vector(unlist(znorm)),main="Z/Z[max(abs(Z)]",freq=F,xlab="Z/Z[max(abs(Z)]",ylim=c(0,2))

Now, we might want to plot a biplot of these values vs the value used to normalize it:

R=ncol(posterior.means)
maxes=t(apply(posterior.means,1,function(x){
  rep(x[which.max(abs(x))],R)
}))

png("normstuff.png")
par(mfrow=c(1,2))
plot(eznorm,maxes,main="NormalizingZvsNormalized_PM",xlab="E(Z|data)_norm",ylab="NormalizingValue")
plot(znorm,maxes,main="NormalizingZvsNormalized_Zmle",xlab="Z_norm",ylab="NormalizingValue")
  1. as in ii) but normalize just by sign of largest element, rather than largest element. (that is, we just flip each beta vector so the largest element is positive). Again we don’ t expect to see large negative values. This preserves information about absolute size of beta, not just relative size.
het.sign=function(effectsize){
  t(apply(effectsize,1,function(x){
  x/sign(x[which.max(abs(x))])
}))}


ezsign=het.sign(effectsize = posterior.means)
zsign=het.sign(effectsize = z.stat)

par(mfrow=c(1,2))
hist(as.vector(unlist(ezsign)),main="E(Z|Data)/sign(E(Z|Data)[max(abs(E(Z|Data))])",freq=F,xlab="E(Z|Data)/sign(E(Z|Data)[max(abs(E(Z|Data))])",ylim=c(0,0.4))
hist(as.vector(unlist(zsign)),main="Z/Z[max(abs(Z)]",freq=F,xlab="Z/sign(Z[max(abs(Z)])",ylim=c(0,0.4))

Thresholding:

  1. we want to avoid pitfalls of thresholding. However, as long as we keep these pitfalls in mind it might be useful to give numbers from thresholding. (eg that histogram we made that showed how many tissues each eQTL was “significant” in)

Pull out tissue-specifics; examine examples.

Here’s an example:

j=which(rownames(z.stat)=="ENSG00000244171.3_3_142895174_T_G_b37")

barplot(as.matrix(maxz[j,]),las=2,cex.names=0.5,main=paste0("Z Statistics",rownames(maxz)[j]))

barplot(as.numeric(posterior.means[j,]),main=paste0("E(Z|EZ)",rownames(maxz)[j]),col=col.func(j,lfsr=lfsr,posterior.means=posterior.means),las=2,cex.names=0.8,ylab="PosteriorMean")

Let’s examine ENSG00000244171.3_3_142895174_T_G_b37 do discover his function:

pre-B-cell leukemia homeobox 2 pseudogene 1 [Source:HGNC Symbol;Acc:HGNC:8635]

We can find all the ones who have high loading on the ‘tissue specific’ configs:

pw.singles=post.weights[,sapply(covmat,function(x){sum(diag(x)!=0)==1})]
singleton.rank=rowSums(pw.singles)
j=which.max(singleton.rank)

par(mfrow=c(1,2))
barplot(as.matrix(maxz[j,]),las=2,cex.names=0.5,main=paste0("Z Statistics",rownames(maxz)[j]))
barplot(as.numeric(posterior.means[j,]),main=paste0("E(Z|EZ)",rownames(maxz)[j]),col=col.func(j,lfsr=lfsr,posterior.means=posterior.means),las=2,cex.names=0.8,ylab="PosteriorMean")

This particular gene is Desmoplakin, known to be involved in dilated cardiomyopathy.

pw.all=post.weights[,sapply(covmat,function(x){sum(diag(x)/max(diag(x)))/sum(x[1,2]!=0)==44})]
cons.rank=rowSums(pw.all[,6:22])
j=which.max(cons.rank)

par(mfrow=c(1,2))
barplot(as.matrix(maxz[j,]),las=2,cex.names=0.5,main=paste0("Z Statistics",rownames(maxz)[j]))
barplot(as.numeric(posterior.means[j,]),main=paste0("E(Z|EZ)",rownames(maxz)[j]),col=col.func(j,lfsr=lfsr,posterior.means=posterior.means),las=2,cex.names=0.8,ylab="PosteriorMean")

For each of your sampled beta vectors, normalize beta_t by its “largest” element (where largest means largest in absolute value, but you normalize by the actual value, not the absolute value, so that the normalized vector has maximum element +1). Then just plot a histogram of the resulting normalized beta values (pooled across all the samples). We should see very strong negative values are rare?

hetaugust=readRDS("~/Dropbox/Aug12/simulatedmvn.rds")
max.val=readRDS("~/Dropbox/Aug12/simulatedmaxval.rds")
par(mfrow=c(1,2))
hist(unlist(as.vector(hetaugust)),main="Simulated_Z/Z[,which.max(abs(Z))]",xlab="Znorms_simulatedfromPosterior",freq=F,ylim=c(0,1.5))
hist(unlist(as.vector(znorm)),main="Z/Z[,which.max(abs(Z))]",xlab="Znorm",freq=F,ylim=c(0,1.5))    

Here, we divide by the maximum sign value.

hetaugust=readRDS("~/Dropbox/Aug12/simulatedmvnsign.rds")
par(mfrow=c(1,2))
hist(unlist(as.vector(hetaugust)),main="Simulated_Z/Z[,which.max(abs(Z))]",xlab="Znorms_simulatedfromPosterior",freq=F,ylim=c(0,.5))


hist(unlist(as.vector(zsign)),main="Z/sign(Z[,which.max(abs(Z))])",xlab="Znorm",freq=F,ylim=c(0,0.5))    

Biplots of simulated values:

rep.col<-function(x,n){
   matrix(rep(x,each=n), ncol=n, byrow=TRUE)
}

bi_plot_funct=function(sim,max.val,sim.array){
  j=sim
  mat.col=max.val[,j]
  a=rep.col(max.val[,j],44)
  plot(sim.array[,j,],a,main="MaxValuevsSimulatedNormalizedValue",xlab="SimulatedArrayDividedbyMaxVal",ylab="MaximumValue")
  }


bi_plot_funcmaxz=function(max.val,znorm){
 a=rep.col(max.val,44)
  plot(znorm,a,main="MaxValuevsSimulatedNormalizedValue",xlab="SimulatedArrayDividedbyMaxVal",ylab="MaximumValue")
  }

For each simulation, we can make a biplot against the rvalue used to normalize the vector (the maxval).

hetsign=readRDS("~/Dropbox/Aug12/simulatedmvnsign.rds")
hetval=readRDS("~/Dropbox/Aug12/simulatedmvn.rds")
png("simulatedvsmax.png")
par(mfrow=c(1,2))
bi_plot_funct(sim=5,max.val = max.val,sim.array=hetsign)
bi_plot_funct(sim=5,max.val = max.val,sim.array=hetval)
dev.off()

z.max=apply(z.stat,1,function(x){x[which.max(abs(x))]})

png("mlevsmax.png")
par(mfrow=c(1,2))
bi_plot_funcmaxz(max.val=z.max,znorm = zsign)
bi_plot_funcmaxz(max.val=z.max,znorm = znorm)
dev.off()

Understanding patterns of sharing:

We can see that the majority of weight is on the non-single rank \(U_ks\). In fact, the third \(U_k\) which corresponds to the rank 3.

barplot(diag(covmat[[num.mat[22,3]]])/max(diag(covmat[[num.mat[22,3]]])),main="Prior Effect Size Sigma per Tissue of Most Common Covariance Matrix",names=colnames(z.stat),las=2,cex.names=0.5)

Even though the sqrt(maximum\(\omega\))is only 7, this captures the vast majority (over 95%) of the data, if we follow the principle that the max(\(\omega\)) should be roughly \(se(z)\)^2+\(z^2\).

dir=
heterogeneity=read.table("../Data/het.tablewithED.txt")[,1]
hist(heterogeneity,main="Proportion of Simulations with H0 False",freq=FALSE,xlab="Proportion of Simulations with Sign Heterogeneity")

sum(heterogeneity)
## [1] 10337.48
h=which(heterogeneity>0.5)
c=which(heterogeneity<0.5)
length(h)
## [1] 10769
length(c)
## [1] 5211
summary(unlist(as.vector(lfsr[h,])))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00213 0.07410 0.14610 0.26570 0.75990
summary(unlist(as.vector(lfsr[c,])))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0000000 0.0000841 0.1320000 0.0345400 1.0000000
par(mfrow=c(1,2))
hist(unlist(as.vector(lfsr[h,])),main=NULL,ylab=NULL,xlab=NULL,freq=FALSE,ylim=c(0,14))

title(main="LFSRs of Inconsistent genes", col.main="red", 
   col.sub="blue", 
    xlab="LFSR", ylab="Density",
  col.lab="green", cex.lab=0.75,ylim=c(0,14))

hist(unlist(as.vector(lfsr[c,])),main=NULL,ylab=NULL,xlab=NULL,freq=FALSE)

title(main="LFSRs of Consistent genes", col.main="red", 
 col.sub="blue", 
   xlab="LFSR", ylab="Density",
  col.lab="green", cex.lab=0.75)

But we can see that the lfsrs of the heterogeneous SNPs are consistency larger than the lfsrs of the consistent SNP, meaning that our model will alert us to their uncertainty.

To get an idea of the presence of tissue specifics, we can look at the prior weight on the tissue specific configuraitons.

singleton.weights=pis[sapply(covmat,function(x){sum(diag(x)!=0)==1})]
sum(colSums(pi.mat[,10:53]))
## [1] 0.005241691

We might say that only 0.0052417 percent of genes demonstrate tissue specificity.

And how about the ‘completely consistent’?

#singleton.weights=pis[sapply(covmat,function(x){sum(diag(x)!=0)==1})]
colSums(pi.mat)[54]
## [1] 0.02572196

Some tissue specific examples:

library("qtlcharts")


 
plot_ts("Testis",lfsr, z.stat,0.05)
## Set screen size to height=700 x width=1000

plot_ts("Testis",lfsr, posterior.means,0.05)

plot_ts("Whole_Blood",lfsr, z.stat,0.05)

plot_ts("Whole_Blood",lfsr, posterior.means,0.05)

plot_ts("Muscle_Skeletal",lfsr,z.stat,0.05)

plot_ts("Liver",lfsr, z.stat,0.05)

plot_ts("Liver",lfsr, posterior.means,0.05)

plot_ts("Lung",lfsr, z.stat,0.05)

plot_ts("Lung",lfsr, posterior.means,0.05)

plot_ts("Lung",lfsr, z.stat,0.05)

plot_ts("Lung",lfsr, posterior.means,0.05)

plot_ts("Muscle_Skeletal" ,lfsr, z.stat,0.05)

plot_ts("Muscle_Skeletal" ,lfsr, posterior.means,0.05)

clean_names=sapply(rownames(maxz),function(x){ strsplit(x, "[.]")[[1]][1]})
liver=which(clean_names=="ENSG00000166923")
barfunc(genename = liver)

lung=which(clean_names=="ENSG00000151468")
barfunc(genename = lung)

testes=which(rownames(maxz)=="ENSG00000131848.5_19_56741808_G_A_b37")
##potassium channel subfamily M regulatory beta subunit 4
barfunc(genename = testes)

msk=(which(rownames(maxz)=="ENSG00000103316.6_16_21277438_C_T_b37"))
barfunc(genename = msk)

##crystallin

ms3=which(clean_names=="ENSG00000152601")
barfunc(genename = ms3)

##ENSG00000152601  muscleblind-like (Drosophila)


ms4=which(clean_names=="ENSG00000176658")
barfunc(genename = ms4)

##ENSG00000152601  muscleblind-like (Drosophila)

thyroid=(which(rownames(maxz)=="ENSG00000075275.12_22_46859003_G_C_b37"))
##cadherin, EGF LAG seven-pass G-type receptor 1
barfunc(genename = thyroid)

adrenal=(which(rownames(maxz)=="ENSG00000135643.4_12_70965479_A_G_b37"))

barfunc(genename = adrenal)

#potassium channel subfamily M regulatory beta subunit 4

wholebloodone=(which(rownames(maxz)=="ENSG00000156110.9_10_75928933_T_C_b37"))
barfunc(genename = wholebloodone)

#Adenosine kinase (HGNC Symbol)
#Entrez gene summary
#This gene an enzyme which catalyzes the transfer of the gamma-phosphate from ATP to adenosine, thereby serving as a regulator of concentrations of both extracellular adenosine and intracellular adenine nucleotides. Adenosine has widespread effects on the cardiovascular, nervous, respiratory, and immune systems and inhibitors of the enzyme could play an important pharmacological role in increasing intravascular adenosine concentrations and acting as anti-inflammatory agents. Multiple transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Jan 2011]"

wholebloodtwo=(which(rownames(maxz)=="ENSG00000078114.14_10_21501314_T_C_b37"))


barfunc(genename = wholebloodtwo)

#Experiment name  Transcription profiling of human blood taken from individuals fed an antioxidant-rich diet or a diet containing kiwi-fruit to evaluate the effect on gene expression with emphasis on stress and repair related processes [E-MEXP-2030]
# Source    ArrayExpress
# Experiment description    Plant-based diets rich in fruit and vegetables can prevent development of several chronic age-related diseases such as cancer and cardiovascular diseases. The mechanisms behind this protective effect is not elucidated. We have studied whether a specially designed antioxidant-rich food diet and a kiwi-fruit intervention can affect whole genome expression in human blood cells with emphasis on stress and repair related process.
# Chips 
# Chip ID: OAS12-1 - Annotation by Bgee curators: Anatomical structure ID: EV:0100047 - Developmental stage ID: HsapDO:0000148
# Probeset ID: 203961_at - Mapped to gene: ENSG00000078114 - Detection flag: absent - Quality: high quality

We also denoise and shrink the consistent effects away from 0.

plot_tc(lfsr,curvedata = maxz,thresh = 0.00005)

plot_tc(lfsr,curvedata = posterior.means,thresh = 0.00005)

Power compariosn

sum(lfsr<0.05)
## [1] 393414
p.vals=2*(1-pnorm((as.matrix(unlist(abs(z.stat))))))
hist(p.vals,freq=FALSE)
abline(h=1)
library('qvalue')

qs=qvalue(p.vals)
sum(qs$qvalues<0.05)
## [1] 202087

Also,

Now, for each QTL I want to examine the posterior means of the tissues in which it was ‘active’ (at an lfsr threshold) and find QTL whose sign differed in at least 2 significant QTL. We restrict our analysis to only those gene snp pairs who show significance in at least one tissue:

l=apply(lfsr,1,function(x){sum(x<0.05)})
sum(l!=0)
## [1] 14146

Here we can see that is `r sum(l!=0)’ genes. Then we look to see if the posterior means differed in sign.

Now, for each QTL I want to examine the posterior means of the tissues in which it was ‘active’ (at an lfsr threshold) and find QTL whose sign differed in at least 2 significant QTL.

thresh=0.05
s=sapply(seq(1:nrow(posterior.means)),function(x){
l=lfsr[x,];p=posterior.means[x,];plow=p[which(l<thresh)];##grab only those posterior means that are 'significant'
if(length(plow)==0){return("FALSE")}##for ones who show no significants, they can't be heterogenous
else{pos=sum(plow>0);neg=sum(plow<0);pos*neg!=0}})
sum(s=="TRUE")
## [1] 3180
hets=which(s=="TRUE")

Now, for each snp, simulate 100 draws from mvnorm according to posterior weights; count proportion of times signs differ. This takes about 2 hours .

heterogeneity=sapply(seq(1:nrow(maxz)),function(j){sim.array.generation(j,b.j.hat=maxz,s.j,covmat,pis,sim=100)})
write.table(heterogeneity,"heterogeneityindex.txt",row.names=rownames(maxz))

We can examine the distribution of heterogeneity.

heterogeneity=read.table("../Data/het.tablewithED.txt")[,1]
hist(heterogeneity,main="Proportion of Simulations with H0 False",freq=FALSE,xlab="Proportion of Simulations with Sign Heterogeneity")

sum(heterogeneity)
## [1] 10337.48

If we compare with original z statistics:

zhets=apply(z.stat,1,function(dat){pos=sum(dat>0);neg=sum(dat<0);pos*neg!=0})
sum(zhets)
## [1] 14557

```